Registry Outliers
Differences in birth data
Compare birth data by residential address vs first vaccination location
T-test at both province and commune level suggests a difference in birth count computed by residential address and first vaccination location
Binh Dinh are excluded before testing since we don’t have any data where vaccination location is Binh Dinh
Code
# aggregate to birth data per year at commune level
aggregated_birth <- birth_data %>%
group_by(VID_1, VID_3, year) %>%
summarize(
nborn_doc = sum(nborn_doc, na.rm = TRUE),
nborn_vac = sum(nborn_vac, na.rm = TRUE)
) %>%
ungroup() %>%
left_join(
location %>%
select(VID_3,province, district, commune) %>% unique(),
by = join_by(VID_3 == VID_3)
) Compare birth count at commune level
Code
# histogram for birth count difference
# ggplotly(
# ggplot(data = aggregated_birth) +
# geom_histogram(
# aes(x = abs(nborn_doc - nborn_vac)), fill = "blue", alpha = 0.5
# )
# )
# compare birth data from vaccination vs residential address
# t-test suggest significant difference
# exclude Binh Dinh before comparison
with(aggregated_birth, {
# log10 due to right skewed data
t.test(log10(nborn_doc[!(VID_1 == 52)] + 1) - log10(nborn_vac[!(VID_1 == 52)] + 1))
})
One Sample t-test
data: log10(nborn_doc[!(VID_1 == 52)] + 1) - log10(nborn_vac[!(VID_1 == 52)] + 1)
t = 71.669, df = 93950, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
0.01605296 0.01695567
sample estimates:
mean of x
0.01650432
Compare birth count at province level
Code
# exclude Binh Dinh before comparison
with(aggregated_birth %>%
group_by(VID_1, year) %>%
summarize(nborn_doc = sum(nborn_doc, na.rm = TRUE),
nborn_vac = sum(nborn_vac, na.rm = TRUE)), {
# log10 due to right skewed data
t.test(log10(nborn_doc[!(VID_1 == 52)] + 1) - log10(nborn_vac[!(VID_1 == 52)] + 1))
})`summarise()` has grouped output by 'VID_1'. You can override using the
`.groups` argument.
One Sample t-test
data: log10(nborn_doc[!(VID_1 == 52)] + 1) - log10(nborn_vac[!(VID_1 == 52)] + 1)
t = 3.6972, df = 557, p-value = 0.0002395
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
0.01344436 0.04392114
sample estimates:
mean of x
0.02868275
Geo plot for birth data
Visualize difference in number of births computed by residential address (birth count by doc) or first location of vaccination (birth count by vac)
\(Ratio = \frac{\text{Birth count by vac}}{\text{Birth count by doc}}\)
- Birth data for Binh Dinh computed by vac is 0 for all years since registry does not have data with registered address (nborn_vac) from Binh Dinh
- The birth count ratio for Binh Duong is much higher relative to other provinces
Click on each province to view changes in birth count for that district over time
Birth count by province over the year
Code
ggplotly(
birth_data %>%
filter(year > 2014) %>%
group_by(VID_1, year) %>%
summarize(
nborn = sum(nborn_vac, na.rm=TRUE)
) %>%
left_join(
location %>% select(VID_1, province) %>% unique(),
by = join_by(VID_1 == VID_1)) %>%
ggplot() +
geom_line(aes(x = year, y = nborn, color = province))
)Code
ggplotly(
birth_data %>%
filter(year > 2014) %>%
group_by(VID_1, year) %>%
summarize(
nborn = sum(nborn_doc, na.rm=TRUE)
) %>%
left_join(
location %>% select(VID_1, province) %>% unique(),
by = join_by(VID_1 == VID_1)) %>%
ggplot() +
geom_line(aes(x = year, y = nborn, color = province))
)10,557 communes where there were at least 1 year where birth count = 0
Code
commune_w_no_birth <- birth_data %>%
left_join(
location %>% select(VID_3, province, district, commune) %>% unique(),
by = join_by(VID_3 == VID_3)
) %>%
filter(
nborn_doc == 0 | nborn_vac == 0
) %>%
select(VID_3, province, district, commune) %>% unique()
commune_w_no_birthCode
download_this(
commune_w_no_birth,
button_label = "Download communes with at least 1 year with 0 birth count",
output_name = "communes_no_birth",
output_extension = ".rds",
button_type = "primary",
has_icon = TRUE,
icon = "fa fa-save"
)Compare with population from Worldpop
Compute ratio with data estimated by world pop (\(\frac{\text{birth count per commune per year}}{\text{WorldPop esimation for population for 0-12m age group}}\) ) to identify communes with low birth counts compared to WorldPop’s estimation
Record is consider outlier if computed ratio < 20% quantile of all computed ratio
Code
potential_outlier_birth_count <- birth_data %>%
filter(year > 2014, year < 2022) %>%
group_by(VID_3, year) %>%
summarize(
nborn_vac = sum(nborn_vac, na.rm = TRUE),
nborn_doc = sum(nborn_doc, na.rm = TRUE)
) %>%
ungroup() %>%
left_join(
population_data %>%
select(VID_3, year, age_00) %>%
mutate(year = as.numeric(year)) %>% unique(),
by = join_by(VID_3 == VID_3, year == year)
) %>%
left_join(
location %>% select(VID_3, province, district, commune) %>% unique(),
by = join_by(VID_3 == VID_3)
) %>%
filter(
# filter out instances where nborn is 0
nborn_vac > 0 & nborn_doc > 0,
# filter out instances where count in worlpop < count in registry data as those are likely
# due to underestimation
(age_00 > nborn_vac)|(age_00 > nborn_doc)
) %>%
mutate(
vac_worldpop_ratio = nborn_vac/age_00,
doc_worldpop_ratio = nborn_doc/age_00
) %>% filter(
vac_worldpop_ratio < quantile(vac_worldpop_ratio, 0.2)
) %>%
arrange(nborn_vac, nborn_doc)
potential_outlier_birth_countCode
download_this(
potential_outlier_birth_count,
button_label = "Download data table",
output_name = "potential_outlier_birth_count",
output_extension = ".rds",
button_type = "primary",
has_icon = TRUE,
icon = "fa fa-save"
)Compare measles vaccination count computed by different type of addresses
Comparing nshot between loctype == "vac" (nshot computed by first vaccination place) loctype == "ignore" (nshot computed by the address of vaccination registration) and loctype == "doc" (nshot computed by residential address)
Both t-test and Wilcoxon rank sum test suggests similarity between nshot with loctype == "vac" and loctype == "ignore" while nshot computed with loctype == "doc" is slightly different
Code
loctypes <- c("doc", "ignore", "vac")
loctype_pairs <- list(c(1,2), c(2,3), c(1,3))
for (idx_pair in loctype_pairs){
print(paste0(">> Check ", paste(loctypes[idx_pair], collapse = ", "), " pair"))
# compare using t test
# log2 due to rightly skewed data
print(t.test(log2(nshot) ~ loctype, pairwise = TRUE, data = shot_by_commune, subset = loctype %in% loctypes[idx_pair]))
# compare using wilcoxon ranking test
print(wilcox.test(nshot ~ loctype, pairwise = TRUE, data = shot_by_commune, subset = loctype %in% loctypes[idx_pair]))
}[1] ">> Check doc, ignore pair"
Welch Two Sample t-test
data: log2(nshot) by loctype
t = -9.3337, df = 170216, p-value < 2.2e-16
alternative hypothesis: true difference in means between group doc and group ignore is not equal to 0
95 percent confidence interval:
-0.10631878 -0.06941615
sample estimates:
mean in group doc mean in group ignore
6.899471 6.987339
Wilcoxon rank sum test with continuity correction
data: nshot by loctype
W = 3600392303, p-value = 0.004477
alternative hypothesis: true location shift is not equal to 0
[1] ">> Check ignore, vac pair"
Welch Two Sample t-test
data: log2(nshot) by loctype
t = 1.0594, df = 168256, p-value = 0.2894
alternative hypothesis: true difference in means between group ignore and group vac is not equal to 0
95 percent confidence interval:
-0.008308821 0.027857521
sample estimates:
mean in group ignore mean in group vac
6.987339 6.977564
Wilcoxon rank sum test with continuity correction
data: nshot by loctype
W = 3543314992, p-value = 0.678
alternative hypothesis: true location shift is not equal to 0
[1] ">> Check doc, vac pair"
Welch Two Sample t-test
data: log2(nshot) by loctype
t = -8.2649, df = 170428, p-value < 2.2e-16
alternative hypothesis: true difference in means between group doc and group vac is not equal to 0
95 percent confidence interval:
-0.09661251 -0.05957372
sample estimates:
mean in group doc mean in group vac
6.899471 6.977564
Wilcoxon rank sum test with continuity correction
data: nshot by loctype
W = 3610756878, p-value = 0.01542
alternative hypothesis: true location shift is not equal to 0
Geo plot for comparison
\(Ratio = \frac{\text{Vaccount per province by vac}}{\text{Vaccount per province by doc}}\)
Warning in pal(vac_doc_ratio): Some values were outside the color scale and
will be treated as NA
\(Ratio = \frac{\text{Vaccount per district by vac}}{\text{Vaccount per district by doc}}\)
Warning in pal(vac_doc_ratio): Some values were outside the color scale and
will be treated as NA
Identify potential outliers in vaccination count
Vaccount computed by loctype == "ignore" is used for outlier detection
Detect potential vaccination count outliers for communes
Identify communes that have low vaccination count relative to the vaccination count of other communes in the same district
Metrics to identify outlier communes for each district is as followed
Based on mean: \(\text{std\_diff\_fr\_mean} = \frac{\text{vaccount\_commune - mean\_vaccount}}{\text{sd\_vaccount}}\)
vaccount_communeis vaccination count for each communemean_vaccountis mean vaccination count per commune for the corresponding districtsd_vaccountis standard deviation for vaccination per commune for the corresponding district
Based on median: \(\text{std\_diff\_fr\_median} = \frac{\text{vaccount\_commune - median\_vaccount}}{\text{mad\_vaccount}}\)
vaccount_communeis vaccination count for each communemedian_vaccountis median vaccination count per commune for the corresponding districtmad_vaccountis the median absolute deviation per commune for the corresponding district
Additional computed metric:
- Compute proportion of vaccination count of each commune relative to it’s district i.e. \(\frac{\text{total vaccination of commune}}{\text{total vaccination of district}}\)
Vaccount for a commune is considered outlier if
\(\text{std\_diff\_fr\_mean} < \text{qnorm (0.2, mean = mean(std\_diff\_fr\_mean), sd = sd(std\_diff\_fr\_mean))}\)
OR \(\text{std\_diff\_fr\_median} < \text{qnorm (0.2, mean = mean(std\_diff\_fr\_median), sd = sd(std\_diff\_fr\_median))}\)
Code
filter_loctype <- "ignore"
props_commune_district <- measles %>%
filter(
# keep entries for specified location type
loctype == filter_loctype,
!is.na(VID_1)) %>%
group_by(VID_1, VID_2, VID_3) %>%
summarize(
# summarize to collapse result
vaccount_commune = sum(nshot, na.rm = TRUE)
) %>%
ungroup() %>%
group_by(VID_2) %>% mutate(
vaccount_district = sum(vaccount_commune, na.rm = TRUE),
# median vaccount per commune for each district
median_vaccount = median(vaccount_commune, na.rm = TRUE),
# median absolute deviation
mad_vaccount = mad(vaccount_commune, na.rm = TRUE),
# mean vaccount per district for each province
mean_vaccount = mean(vaccount_commune, na.rm = TRUE),
# compute sd
sd_vaccount = sd(vaccount_commune, na.rm = TRUE)
) %>% ungroup() %>%
mutate(
prop = vaccount_commune/vaccount_district,
ratio_to_median = vaccount_commune/median_vaccount,
# compute standard deviation from mean and median
std_diff_fr_mean = (vaccount_commune - mean_vaccount)/sd_vaccount,
std_diff_fr_median = (vaccount_commune - median_vaccount)/mad_vaccount
) %>%
left_join(
location %>% select(VID_3, province, district, commune) %>% unique(),
by = join_by(VID_3 == VID_3)
)
potential_commune_outliers <- props_commune_district %>%
group_by(VID_2) %>%
filter(
std_diff_fr_median < qnorm(0.2, mean = mean(std_diff_fr_median),
sd = sd(std_diff_fr_median)) |
std_diff_fr_mean < qnorm(0.2, mean = mean(std_diff_fr_mean), sd = sd(std_diff_fr_mean))
) %>%
ungroup() %>%
arrange(VID_2, std_diff_fr_median, std_diff_fr_mean)
potential_commune_outliers %>% relocate(VID_1, VID_2, VID_3, province, district, commune) %>%
arrange(vaccount_commune)Detect potential vaccination count outliers for districts
Similar to the process of detecting communes with low vaccination count, the metrics for detecting outlier districts are as followed
Computed metrics
Metrics to identify outlier district for each province Based on mean:
Based on mean: \(\text{std\_diff\_fr\_mean} = \frac{\text{vaccount\_district - mean\_vaccount}}{\text{sd\_vaccount}}\)
vaccount_districtis vaccination count for each districtmean_vaccountis mean vaccination count per district for the corresponding provincesd_vaccountis standard deviation for vaccination per district for the corresponding province
Based on median: \(\text{std\_diff\_fr\_median} = \frac{\text{vaccount\_district - median\_vaccount}}{\text{mad\_vaccount}}\)
vaccount_districtis vaccination count for each districtmedian_vaccountis median vaccination count per district for the corresponding provincemad_vaccountis the median absolute deviation per district for the corresponding province
Additional computed metric:
- Compute proportion of vaccination count of each commune relative to it’s district i.e. \(\frac{\text{total vaccination of commune}}{\text{total vaccination of district}}\)
District with low vaccination count: Cồn Cỏ, Trường Sa, Bạch Long Vĩ (islands of Vietnam)
Code
# separate aggregate at district level
filter_loctype <- "ignore"
props_district_province <- measles %>%
filter(
# keep entries for specified location type
loctype == filter_loctype,
!is.na(VID_1)) %>%
group_by(VID_1, VID_2) %>%
summarize(
# summarize to collapse result
vaccount_district = sum(nshot, na.rm = TRUE)
) %>%
ungroup() %>%
group_by(VID_1) %>% mutate(
vaccount_province = sum(vaccount_district, na.rm = TRUE),
# median vaccount per district for each province
median_vaccount = median(vaccount_district, na.rm = TRUE),
# median absolute deviation
mad_vaccount = mad(vaccount_district, na.rm = TRUE),
# mean vaccount per district for each province
mean_vaccount = mean(vaccount_district, na.rm = TRUE),
# compute sd
sd_vaccount = sd(vaccount_district, na.rm = TRUE)
) %>% ungroup() %>%
mutate(
prop = vaccount_district/vaccount_province,
ratio_to_median_district = vaccount_district/median_vaccount,
# compute standard deviation from mean and median
std_diff_fr_mean = (vaccount_district - mean_vaccount)/sd_vaccount,
std_diff_fr_median = (vaccount_district - median_vaccount)/mad_vaccount
) %>%
left_join(
location %>% select(VID_2, province, district) %>% unique(),
by = join_by(VID_2 == VID_2)
)
potential_district_outliers <- props_district_province %>%
group_by(VID_1) %>%
filter(
std_diff_fr_median < qnorm(0.2, mean = mean(std_diff_fr_median),
sd = sd(std_diff_fr_median)) |
std_diff_fr_mean < qnorm(0.2, mean = mean(std_diff_fr_mean), sd = sd(std_diff_fr_mean))
) %>%
ungroup() %>%
arrange(VID_1, std_diff_fr_median, std_diff_fr_mean)
potential_district_outliers %>% arrange(vaccount_district) %>%
relocate(VID_1, VID_2, province, district)Geo map
Geo map for vaccount density for each district relative to its province
\[ Prop = \frac{\text{vaccount for district}}{\text{vaccount for the corresponding province}} \]
Code
to_plot <- district_map %>%
left_join(
props_district_province %>%
select(VID_2, district, province, prop) %>%
mutate(label = paste(district, province, sep = "_")),
by = join_by(VID_2 == VID_2)
)
pal <- colorBin(palette = "Blues",
domain = to_plot$prop[to_plot$prop > 0], bin = 8)
labels <- sprintf("<strong> %s </strong> <br/> Prop: %f",
paste(to_plot$province,
to_plot$district),
to_plot$prop) %>%
lapply(htmltools::HTML)
# plot propotion of vaccount
to_plot %>%
leaflet() %>%
addPolygons(color = "grey", weight = 1, label = labels,
fillColor = ~pal(prop), fillOpacity = 0.8, group = "vaccount_prop") %>%
addPolygons(
# add province border
data = province_map,
weight = 1, fill=FALSE, color = "black", group = "vaccount_prop"
) %>%
addLegend(pal = pal, values = ~prop, opacity = 0.5, title = "Prop", position = "bottomright")